import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
df = pd.read_csv('Gradesv2.csv')
df.head()
Subject Yr Student ID | SGV | Assignment \nTotal: 100 | Test Total: 100 | Individual Presentation \nTotal: 100 | Grp Project\nTotal: 100 | Subject Mark | Subject Grade | Subject Grade Point | Gender | Age | Institute | Programme | Admission GPA | Unnamed: 14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | W19S01 | 0 | 88 | 73.0 | 90.0 | 85 | 84.5 | A | 4.0 | M | 24.0 | Polyu Hkcc | Associate in Information Technology | 3.19 | NaN |
1 | W19S02 | 0 | 40 | 65.0 | 65.0 | 50 | 53.0 | C+ | 2.5 | M | 23.0 | Cityu Community Coll | |Associate Degree of Science in Information Sy... | 2.93 | NaN |
2 | W19S03 | 0 | 44 | 31.0 | 60.0 | 73 | 53.3 | C+ | 2.5 | M | 27.0 | IVE | Higher Diploma in Mobile Applications Development | 2.68 | NaN |
3 | W19S04 | 0 | 96 | 65.0 | 90.0 | 94 | 88.0 | A+ | 4.5 | M | 24.0 | IVE | Higher Diploma in Software Engineering | 3.18 | NaN |
4 | W19S05 | 0 | 58 | 63.0 | 90.0 | 94 | 76.2 | A | 4.0 | M | 23.0 | Polyu HKcc | AD in Information Technology | 2.36 | NaN |
df.columns #Viewing Columns
Index(['Subject Yr Student ID', 'SGV', 'Assignment \nTotal: 100', 'Test Total: 100', 'Individual Presentation \nTotal: 100', 'Grp Project\nTotal: 100', 'Subject Mark', 'Subject Grade', 'Subject Grade Point', 'Gender', 'Age', 'Institute', 'Programme', 'Admission GPA', 'Unnamed: 14'], dtype='object')
df.info() #Viewing information about each column
<class 'pandas.core.frame.DataFrame'> RangeIndex: 131 entries, 0 to 130 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Subject Yr Student ID 131 non-null object 1 SGV 131 non-null int64 2 Assignment Total: 100 131 non-null int64 3 Test Total: 100 131 non-null float64 4 Individual Presentation Total: 100 131 non-null float64 5 Grp Project Total: 100 131 non-null int64 6 Subject Mark 131 non-null float64 7 Subject Grade 131 non-null object 8 Subject Grade Point 131 non-null float64 9 Gender 131 non-null object 10 Age 129 non-null float64 11 Institute 128 non-null object 12 Programme 128 non-null object 13 Admission GPA 128 non-null object 14 Unnamed: 14 3 non-null object dtypes: float64(5), int64(3), object(7) memory usage: 15.5+ KB
percent_missing = df.isnull().sum() * 100 / len(df)
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing
Subject Yr Student ID 0.0% SGV 0.0% Assignment \nTotal: 100 0.0% Test Total: 100 0.0% Individual Presentation \nTotal: 100 0.0% Grp Project\nTotal: 100 0.0% Subject Mark 0.0% Subject Grade 0.0% Subject Grade Point 0.0% Gender 0.0% Age 1.53% Institute 2.29% Programme 2.29% Admission GPA 2.29% Unnamed: 14 97.71% dtype: object
dup_percentage = df.duplicated().sum()/len(df)*100
dup_percentage = round(dup_percentage,2)
dup_percentage = dup_percentage.astype(str) + '%'
dup_percentage
'0.0%'
# Dropping row with high missing values
df = df.drop('Unnamed: 14', 1)
# Renaming Columns to be more meaningful
df.rename(columns = {'Subject Yr Student ID':'ID','Test Total: 100':'Test'}, inplace = True)
df.rename(columns = {'Assignment \nTotal: 100':'Assignment'}, inplace = True)
df.rename(columns = {'Individual Presentation \nTotal: 100':'Individual Presentation'}, inplace = True)
df.rename(columns = {'Grp Project\nTotal: 100':'Grp Project'}, inplace = True)
# Arranging Columns
cols = ['Subject Yr Student ID','SGV', 'Assignment', 'Test', 'Individual Presentation', 'Grp Project',
'Subject Mark', 'Subject Grade', 'Subject Grade Point', 'Gender', 'Age', 'Institute', 'Programme',
'Admission GPA']
# simply drop whole row with NaN in "Age" column
df.dropna(subset=["Institute","Programme"], axis=0, inplace=True)
# reset index, because we droped
df.reset_index(drop=True, inplace=True)
# Checking for missing values after cleaning
percent_missing = df.isnull().sum() * 100 / len(df)
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing
ID 0.0% SGV 0.0% Assignment 0.0% Test 0.0% Individual Presentation 0.0% Grp Project 0.0% Subject Mark 0.0% Subject Grade 0.0% Subject Grade Point 0.0% Gender 0.0% Age 0.0% Institute 0.0% Programme 0.0% Admission GPA 0.0% dtype: object
df.dtypes # Checking Datatypes
ID object SGV int64 Assignment int64 Test float64 Individual Presentation float64 Grp Project int64 Subject Mark float64 Subject Grade object Subject Grade Point float64 Gender object Age float64 Institute object Programme object Admission GPA object dtype: object
df['Age'] = df['Age'].astype('Int64') # Setting Datatype
df['Admission GPA'] = pd.to_numeric(df['Admission GPA'], errors='coerce').astype('float64') # Setting Datatype
#Admission GPA rounding up gpa to 1 D.P
df['Admission GPA'].round(decimals = 1)
0 3.2 1 2.9 2 2.7 3 3.2 4 2.4 ... 123 1.6 124 2.8 125 3.0 126 2.7 127 2.4 Name: Admission GPA, Length: 128, dtype: float64
df.head()
ID | SGV | Assignment | Test | Individual Presentation | Grp Project | Subject Mark | Subject Grade | Subject Grade Point | Gender | Age | Institute | Programme | Admission GPA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | W19S01 | 0 | 88 | 73.0 | 90.0 | 85 | 84.5 | A | 4.0 | M | 24 | Polyu Hkcc | Associate in Information Technology | 3.19 |
1 | W19S02 | 0 | 40 | 65.0 | 65.0 | 50 | 53.0 | C+ | 2.5 | M | 23 | Cityu Community Coll | |Associate Degree of Science in Information Sy... | 2.93 |
2 | W19S03 | 0 | 44 | 31.0 | 60.0 | 73 | 53.3 | C+ | 2.5 | M | 27 | IVE | Higher Diploma in Mobile Applications Development | 2.68 |
3 | W19S04 | 0 | 96 | 65.0 | 90.0 | 94 | 88.0 | A+ | 4.5 | M | 24 | IVE | Higher Diploma in Software Engineering | 3.18 |
4 | W19S05 | 0 | 58 | 63.0 | 90.0 | 94 | 76.2 | A | 4.0 | M | 23 | Polyu HKcc | AD in Information Technology | 2.36 |
df['Programme'].value_counts() # Counting values in each column
Associate in Information Technology 40 Associate of Science in Information Systems Development 8 Higher Diploma in Software Engineering 5 Associate of Engineering 4 Higher Diploma in Telecommunications and Networking 4 Associate in Engineering 4 Associate in Business 3 Higher Diploma in Information Technology 3 AD in Information Systems Development 2 Higher Diploma in Cloud and Data Centre Administration 2 Higher Diploma in Computer Studies 2 Higher Diploma in Mobile Applications Development 2 Associate Degree Mobile Information Technology 2 Associate in Degree Engineering 1 Associate Degree in Mobile Information Technology 1 Associate Degree in Construction Engineering and Management 1 Higher Diploma in Ophthalmic Dispensing 1 Higher Diploma in Multimedia 1 HD in Cloud and Data Centre Administration 1 Higher Diploma in Computer Network Technology 1 Associate Degree Information Systems Development 1 Ascisd 1 Associate of Social Sciences (Sociology Theme) 1 |Associate of Science in Network and Systems Administration 1 Higher Diploma on Hotel Management 1 HD in Electronic and Information Engineering 1 Advance Certificate in Games Development (Game Design) 1 Associate of Arts in Media, Cultural and Creative Studies 1 Higher Diploma in Dispensing Studies 1 Higher Diploma in Aviation Studies 1 Associate in Statistics and Data Science 1 Bachelor of Information Technology 1 Higher Diploma Programme in Computer Game Development 1 Associate of Science in Network and Systems Administration 1 Hd in Mobile Application Development and Cloud Services 1 Associate of Information Systems Development 1 Associate in Information System Development 1 Associate of Science in Business Analysis 1 HD in Games and Animation 1 Associate in Language and Culture 1 Associate of Business Administration in General Management 1 |Associate Degree of Science in Information System Development 1 Associate of Science in Surveying 1 Bachelor's degree in Social Sciences (Honours) in Psychology 1 Associate of NSA 1 Associate of Science 1 HD in Telecommunications and Networking 1 HD in Information Technology 1 HD in Tourism and Events Management 1 Bachelor of Social Sciences 1 HD in Systems Development & Administration 1 Higher Diploma in Automotive Engineering 1 HD in Software Engineering 1 AD in Netwrok and System Administration 1 Associate in Applied Social Sciences| 1 HD in Computer Game Development 1 Higher Diploma in Aircraft Maintenance Engineering 1 AD in Mobile Information Technology 1 AD in Film, Television and Digital Media Studies 1 AD in Information Technology 1 Name: Programme, dtype: int64
df['Programme'] = df.Programme.str.replace('|','')
df['Programme'] = df.Programme.str.replace('AD','Associate Degree')
df['Programme'] = df.Programme.str.replace('Associate in','Associate Degree in')
df['Programme'] = df.Programme.str.replace('Associate of','Associate Degree in')
df['Programme'] = df.Programme.str.replace('HD','Higher Diploma in')
df['Programme'] = df.Programme.str.replace('Hd','Higher Diploma in')
df['Programme'] = df.Programme.str.replace('in in','in')
df['Programme'] = df.Programme.str.replace('Netwrok','Network')
df = df.sort_values(by=['Programme'])
df['Programme'].value_counts()
Associate Degree in Information Technology 41 Associate Degree in Science in Information Systems Development 8 Associate Degree in Engineering 8 Higher Diploma in Software Engineering 6 Higher Diploma in Telecommunications and Networking 5 Higher Diploma in Information Technology 4 Associate Degree in Business 3 Associate Degree in Information Systems Development 3 Higher Diploma in Cloud and Data Centre Administration 3 Associate Degree in Science in Network and Systems Administration 2 Higher Diploma in Mobile Applications Development 2 Associate Degree Mobile Information Technology 2 Higher Diploma in Computer Studies 2 Associate Degree in Mobile Information Technology 2 Associate Degree in NSA 1 Higher Diploma on Hotel Management 1 Ascisd 1 Higher Diploma in Multimedia 1 Associate Degree in Network and System Administration 1 Associate Degree in Language and Culture 1 Higher Diploma in Ophthalmic Dispensing 1 Associate Degree in Construction Engineering and Management 1 Bachelor of Information Technology 1 Higher Diploma in Dispensing Studies 1 Associate Degree in Degree Engineering 1 Higher Diploma in Games and Animation 1 Higher Diploma in Computer Network Technology 1 Advance Certificate in Games Development (Game Design) 1 Associate Degree in Business Administration in General Management 1 Higher Diploma Programme in Computer Game Development 1 Associate Degree in Applied Social Sciences 1 Higher Diploma in Tourism and Events Management 1 Associate Degree in Statistics and Data Science 1 Associate Degree in Science in Business Analysis 1 Associate Degree in Film, Television and Digital Media Studies 1 Associate Degree in Information System Development 1 Higher Diploma in Aviation Studies 1 Associate Degree in Science in Surveying 1 Bachelor of Social Sciences 1 Higher Diploma in Mobile Application Development and Cloud Services 1 Associate Degree in Social Sciences (Sociology Theme) 1 Higher Diploma in Aircraft Maintenance Engineering 1 Higher Diploma in Automotive Engineering 1 Bachelor's degree in Social Sciences (Honours) in Psychology 1 Associate Degree of Science in Information System Development 1 Higher Diploma in Computer Game Development 1 Higher Diploma in Electronic and Information Engineering 1 Associate Degree Information Systems Development 1 Higher Diploma in Systems Development & Administration 1 Associate Degree in Science 1 Associate Degree in Arts in Media, Cultural and Creative Studies 1 Name: Programme, dtype: int64
df['Degree'] = df.Programme.str.replace(r'(^.*Associate Degree.*$)', 'Associate Degree')
df['Degree'] = df.Degree.str.replace(r'(^.*Higher Diploma.*$)', 'Higher Diploma')
df['Degree'] = df.Degree.str.replace(r'(^.*Bachelor.*$)', 'Others')
df['Degree'] = df.Degree.str.replace(r'(^.*Ascisd.*$)', 'Others')
df['Degree'] = df.Degree.str.replace(r'(^.*Advance Certificate.*$)', 'Others')
df['Degree'].value_counts()
Associate Degree 86 Higher Diploma 37 Others 5 Name: Degree, dtype: int64
df['Category'] = df.Programme.str.replace(r'(^.*Information.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Network.*$)', 'Network')
df['Category'] = df.Category.str.replace(r'(^.*Software.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Mobile.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Cloud and Data.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Computer.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Systems.*$)', 'IT')
df['Category'] = df.Category.str.replace(r'(^.*Games.*$)', 'Games')
df['Category'] = df.Category.str.replace(r'(^.*Engineering.*$)', 'Engineering')
df['Category'] = df.Category.str.replace(r'(^.*Business.*$)', 'Business')
df['Category'] = df.Category.str.replace(r'(^.*Science.*$)', 'Science')
df['Category'] = df.Category.str.replace(r'(^.*Media.*$)', 'Media ')
df['Category'] = df.Category.str.replace(r'(^.*Multimedia.*$)', 'Media ')
df['Category'] = df.Category.str.replace(r'(^.*Ophthalmic.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*Management.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*NSA.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*Studies.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*Culture.*$)', 'Others ')
df['Category'] = df.Category.str.replace(r'(^.*Ascisd.*$)', 'Others ')
df['Category'].value_counts()
IT 82 Engineering 12 Network 9 Others 8 Science 7 Business 5 Media 3 Games 2 Name: Category, dtype: int64
df.head()
ID | SGV | Assignment | Test | Individual Presentation | Grp Project | Subject Mark | Subject Grade | Subject Grade Point | Gender | Age | Institute | Programme | Admission GPA | Degree | Category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
49 | W20St09 | 1 | 94 | 100.0 | 72.9 | 76 | 85.6 | A | 4.0 | M | 23 | Digipen Institute of Tecnology Singapore | Advance Certificate in Games Development (Game... | 3.12 | Others | Games |
19 | W19S20 | 0 | 48 | 71.0 | 30.0 | 61 | 52.9 | C+ | 2.5 | M | 23 | Cityu Community Coll | Ascisd | 2.42 | Others | Others |
121 | W21S44 | 1 | 74 | 98.0 | 59.0 | 70 | 75.0 | B+ | 3.3 | M | 20 | Cityu Community Coll | Associate Degree Information Systems Development | 2.88 | Associate Degree | IT |
88 | W21S09 | 1 | 70 | 98.0 | 23.0 | 60 | 63.0 | C+ | 2.3 | M | 20 | Hkbu Cie | Associate Degree Mobile Information Technology | 2.24 | Associate Degree | IT |
115 | W21S38 | 1 | 73 | 100.0 | 43.0 | 60 | 69.0 | B- | 2.7 | M | 20 | Hkbu Cie | Associate Degree Mobile Information Technology | 2.72 | Associate Degree | IT |
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 128 entries, 49 to 123 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 128 non-null object 1 SGV 128 non-null int64 2 Assignment 128 non-null int64 3 Test 128 non-null float64 4 Individual Presentation 128 non-null float64 5 Grp Project 128 non-null int64 6 Subject Mark 128 non-null float64 7 Subject Grade 128 non-null object 8 Subject Grade Point 128 non-null float64 9 Gender 128 non-null object 10 Age 128 non-null Int64 11 Institute 128 non-null object 12 Programme 128 non-null object 13 Admission GPA 126 non-null float64 14 Degree 128 non-null object 15 Category 128 non-null object dtypes: Int64(1), float64(5), int64(3), object(7) memory usage: 17.1+ KB
df['Institute'].value_counts()
PolyU HKCC 52 IVE 24 CityU Community College 24 HKU Space 11 Hkbu Cie 6 Cuhk Scs 3 HK College of Technology 2 Cityu 1 University of South Australia 1 Polyu 1 Suzhou Chien-Shiung Institute of Technology 1 Digipen Institute of Tecnology Singapore 1 Polyu Speed 1 Name: Institute, dtype: int64
df['Institute'] = df.Institute.str.replace(r'(^.*Polyu Hkcc.*$)', 'PolyU HKCC')
df['Institute'] = df.Institute.str.replace(r'(^.*Polyu HKcc.*$)', 'PolyU HKCC')
df['Institute'] = df.Institute.str.replace(r'(^.*Cityu Community Coll.*$)', 'CityU Community College')
df['Institute'] = df.Institute.str.replace(r'(^.*Hku Space.*$)', 'HKU Space')
df['Institute'] = df.Institute.str.replace(r'(^.*HKBU CIE.*$)', 'Hkbu Cie')
df['Institute'] = df.Institute.str.replace(r'(^.*Hkbu Cie.*$)', 'Hkbu Cie')
df['ID'] = le.fit_transform(df['ID'])
df['SGV'] = le.fit_transform(df['SGV'])
df['Assignment'] = le.fit_transform(df['Assignment'])
df['Test'] = le.fit_transform(df['Test'])
df['Individual Presentation'] = le.fit_transform(df['Individual Presentation'])
df['Grp Project'] = le.fit_transform(df['Grp Project'])
df['Subject Mark'] = le.fit_transform(df['Subject Mark'])
df['Subject Grade'] = le.fit_transform(df['Subject Grade'])
df['Subject Grade Point'] = le.fit_transform(df['Subject Grade Point'])
df['Gender'] = le.fit_transform(df['Gender'])
df['Admission GPA'] = le.fit_transform(df['Admission GPA'])
df.to_csv('final.csv')